suppressMessages(
c(library(scater),
library(Seurat),
library(tidyverse),
library(cowplot),
library(Matrix.utils),
library(edgeR),
library(dplyr),
library(magrittr),
library(Matrix),
library(purrr),
library(reshape2),
library(S4Vectors),
library(tibble),
library(SingleCellExperiment),
library(pheatmap),
library(apeglm),
library(png),
library(DESeq2),
library(RColorBrewer),
library(mixOmics),
library(knitr),
library(mosaic),
library(gplots),
library(clusterProfiler),
library(gridExtra),
library(GSVA),
library(RANN))
)
## [1] "RANN" "GSVA" "gridExtra" "clusterProfiler" "gplots"
## [6] "mosaic" "mosaicData" "ggformula" "ggstance" "knitr"
## [11] "mixOmics" "lattice" "MASS" "RColorBrewer" "DESeq2"
## [16] "png" "apeglm" "pheatmap" "reshape2" "magrittr"
## [21] "edgeR" "limma" "Matrix.utils" "Matrix" "cowplot"
## [26] "forcats" "stringr" "dplyr" "purrr" "readr"
## [31] "tidyr" "tibble" "tidyverse" "Seurat" "scater"
## [36] "ggplot2" "SingleCellExperiment" "SummarizedExperiment" "DelayedArray" "matrixStats"
## [41] "Biobase" "GenomicRanges" "GenomeInfoDb" "IRanges" "S4Vectors"
## [46] "BiocGenerics" "parallel" "stats4" "stats" "graphics"
## [51] "grDevices" "utils" "datasets" "methods" "base"
## [56] "RANN" "GSVA" "gridExtra" "clusterProfiler" "gplots"
## [61] "mosaic" "mosaicData" "ggformula" "ggstance" "knitr"
## [66] "mixOmics" "lattice" "MASS" "RColorBrewer" "DESeq2"
## [71] "png" "apeglm" "pheatmap" "reshape2" "magrittr"
## [76] "edgeR" "limma" "Matrix.utils" "Matrix" "cowplot"
## [81] "forcats" "stringr" "dplyr" "purrr" "readr"
## [86] "tidyr" "tibble" "tidyverse" "Seurat" "scater"
## [91] "ggplot2" "SingleCellExperiment" "SummarizedExperiment" "DelayedArray" "matrixStats"
## [96] "Biobase" "GenomicRanges" "GenomeInfoDb" "IRanges" "S4Vectors"
## [101] "BiocGenerics" "parallel" "stats4" "stats" "graphics"
## [106] "grDevices" "utils" "datasets" "methods" "base"
## [111] "RANN" "GSVA" "gridExtra" "clusterProfiler" "gplots"
## [116] "mosaic" "mosaicData" "ggformula" "ggstance" "knitr"
## [121] "mixOmics" "lattice" "MASS" "RColorBrewer" "DESeq2"
## [126] "png" "apeglm" "pheatmap" "reshape2" "magrittr"
## [131] "edgeR" "limma" "Matrix.utils" "Matrix" "cowplot"
## [136] "forcats" "stringr" "dplyr" "purrr" "readr"
## [141] "tidyr" "tibble" "tidyverse" "Seurat" "scater"
## [146] "ggplot2" "SingleCellExperiment" "SummarizedExperiment" "DelayedArray" "matrixStats"
## [151] "Biobase" "GenomicRanges" "GenomeInfoDb" "IRanges" "S4Vectors"
## [156] "BiocGenerics" "parallel" "stats4" "stats" "graphics"
## [161] "grDevices" "utils" "datasets" "methods" "base"
## [166] "RANN" "GSVA" "gridExtra" "clusterProfiler" "gplots"
## [171] "mosaic" "mosaicData" "ggformula" "ggstance" "knitr"
## [176] "mixOmics" "lattice" "MASS" "RColorBrewer" "DESeq2"
## [181] "png" "apeglm" "pheatmap" "reshape2" "magrittr"
## [186] "edgeR" "limma" "Matrix.utils" "Matrix" "cowplot"
## [191] "forcats" "stringr" "dplyr" "purrr" "readr"
## [196] "tidyr" "tibble" "tidyverse" "Seurat" "scater"
## [201] "ggplot2" "SingleCellExperiment" "SummarizedExperiment" "DelayedArray" "matrixStats"
## [206] "Biobase" "GenomicRanges" "GenomeInfoDb" "IRanges" "S4Vectors"
## [211] "BiocGenerics" "parallel" "stats4" "stats" "graphics"
## [216] "grDevices" "utils" "datasets" "methods" "base"
## [221] "RANN" "GSVA" "gridExtra" "clusterProfiler" "gplots"
## [226] "mosaic" "mosaicData" "ggformula" "ggstance" "knitr"
## [231] "mixOmics" "lattice" "MASS" "RColorBrewer" "DESeq2"
## [236] "png" "apeglm" "pheatmap" "reshape2" "magrittr"
## [241] "edgeR" "limma" "Matrix.utils" "Matrix" "cowplot"
## [246] "forcats" "stringr" "dplyr" "purrr" "readr"
## [251] "tidyr" "tibble" "tidyverse" "Seurat" "scater"
## [256] "ggplot2" "SingleCellExperiment" "SummarizedExperiment" "DelayedArray" "matrixStats"
## [261] "Biobase" "GenomicRanges" "GenomeInfoDb" "IRanges" "S4Vectors"
## [266] "BiocGenerics" "parallel" "stats4" "stats" "graphics"
## [271] "grDevices" "utils" "datasets" "methods" "base"
## [276] "RANN" "GSVA" "gridExtra" "clusterProfiler" "gplots"
## [281] "mosaic" "mosaicData" "ggformula" "ggstance" "knitr"
## [286] "mixOmics" "lattice" "MASS" "RColorBrewer" "DESeq2"
## [291] "png" "apeglm" "pheatmap" "reshape2" "magrittr"
## [296] "edgeR" "limma" "Matrix.utils" "Matrix" "cowplot"
## [301] "forcats" "stringr" "dplyr" "purrr" "readr"
## [306] "tidyr" "tibble" "tidyverse" "Seurat" "scater"
## [311] "ggplot2" "SingleCellExperiment" "SummarizedExperiment" "DelayedArray" "matrixStats"
## [316] "Biobase" "GenomicRanges" "GenomeInfoDb" "IRanges" "S4Vectors"
## [321] "BiocGenerics" "parallel" "stats4" "stats" "graphics"
## [326] "grDevices" "utils" "datasets" "methods" "base"
## [331] "RANN" "GSVA" "gridExtra" "clusterProfiler" "gplots"
## [336] "mosaic" "mosaicData" "ggformula" "ggstance" "knitr"
## [341] "mixOmics" "lattice" "MASS" "RColorBrewer" "DESeq2"
## [346] "png" "apeglm" "pheatmap" "reshape2" "magrittr"
## [351] "edgeR" "limma" "Matrix.utils" "Matrix" "cowplot"
## [356] "forcats" "stringr" "dplyr" "purrr" "readr"
## [361] "tidyr" "tibble" "tidyverse" "Seurat" "scater"
## [366] "ggplot2" "SingleCellExperiment" "SummarizedExperiment" "DelayedArray" "matrixStats"
## [371] "Biobase" "GenomicRanges" "GenomeInfoDb" "IRanges" "S4Vectors"
## [376] "BiocGenerics" "parallel" "stats4" "stats" "graphics"
## [381] "grDevices" "utils" "datasets" "methods" "base"
## [386] "RANN" "GSVA" "gridExtra" "clusterProfiler" "gplots"
## [391] "mosaic" "mosaicData" "ggformula" "ggstance" "knitr"
## [396] "mixOmics" "lattice" "MASS" "RColorBrewer" "DESeq2"
## [401] "png" "apeglm" "pheatmap" "reshape2" "magrittr"
## [406] "edgeR" "limma" "Matrix.utils" "Matrix" "cowplot"
## [411] "forcats" "stringr" "dplyr" "purrr" "readr"
## [416] "tidyr" "tibble" "tidyverse" "Seurat" "scater"
## [421] "ggplot2" "SingleCellExperiment" "SummarizedExperiment" "DelayedArray" "matrixStats"
## [426] "Biobase" "GenomicRanges" "GenomeInfoDb" "IRanges" "S4Vectors"
## [431] "BiocGenerics" "parallel" "stats4" "stats" "graphics"
## [436] "grDevices" "utils" "datasets" "methods" "base"
## [441] "RANN" "GSVA" "gridExtra" "clusterProfiler" "gplots"
## [446] "mosaic" "mosaicData" "ggformula" "ggstance" "knitr"
## [451] "mixOmics" "lattice" "MASS" "RColorBrewer" "DESeq2"
## [456] "png" "apeglm" "pheatmap" "reshape2" "magrittr"
## [461] "edgeR" "limma" "Matrix.utils" "Matrix" "cowplot"
## [466] "forcats" "stringr" "dplyr" "purrr" "readr"
## [471] "tidyr" "tibble" "tidyverse" "Seurat" "scater"
## [476] "ggplot2" "SingleCellExperiment" "SummarizedExperiment" "DelayedArray" "matrixStats"
## [481] "Biobase" "GenomicRanges" "GenomeInfoDb" "IRanges" "S4Vectors"
## [486] "BiocGenerics" "parallel" "stats4" "stats" "graphics"
## [491] "grDevices" "utils" "datasets" "methods" "base"
## [496] "RANN" "GSVA" "gridExtra" "clusterProfiler" "gplots"
## [501] "mosaic" "mosaicData" "ggformula" "ggstance" "knitr"
## [506] "mixOmics" "lattice" "MASS" "RColorBrewer" "DESeq2"
## [511] "png" "apeglm" "pheatmap" "reshape2" "magrittr"
## [516] "edgeR" "limma" "Matrix.utils" "Matrix" "cowplot"
## [521] "forcats" "stringr" "dplyr" "purrr" "readr"
## [526] "tidyr" "tibble" "tidyverse" "Seurat" "scater"
## [531] "ggplot2" "SingleCellExperiment" "SummarizedExperiment" "DelayedArray" "matrixStats"
## [536] "Biobase" "GenomicRanges" "GenomeInfoDb" "IRanges" "S4Vectors"
## [541] "BiocGenerics" "parallel" "stats4" "stats" "graphics"
## [546] "grDevices" "utils" "datasets" "methods" "base"
## [551] "RANN" "GSVA" "gridExtra" "clusterProfiler" "gplots"
## [556] "mosaic" "mosaicData" "ggformula" "ggstance" "knitr"
## [561] "mixOmics" "lattice" "MASS" "RColorBrewer" "DESeq2"
## [566] "png" "apeglm" "pheatmap" "reshape2" "magrittr"
## [571] "edgeR" "limma" "Matrix.utils" "Matrix" "cowplot"
## [576] "forcats" "stringr" "dplyr" "purrr" "readr"
## [581] "tidyr" "tibble" "tidyverse" "Seurat" "scater"
## [586] "ggplot2" "SingleCellExperiment" "SummarizedExperiment" "DelayedArray" "matrixStats"
## [591] "Biobase" "GenomicRanges" "GenomeInfoDb" "IRanges" "S4Vectors"
## [596] "BiocGenerics" "parallel" "stats4" "stats" "graphics"
## [601] "grDevices" "utils" "datasets" "methods" "base"
## [606] "RANN" "GSVA" "gridExtra" "clusterProfiler" "gplots"
## [611] "mosaic" "mosaicData" "ggformula" "ggstance" "knitr"
## [616] "mixOmics" "lattice" "MASS" "RColorBrewer" "DESeq2"
## [621] "png" "apeglm" "pheatmap" "reshape2" "magrittr"
## [626] "edgeR" "limma" "Matrix.utils" "Matrix" "cowplot"
## [631] "forcats" "stringr" "dplyr" "purrr" "readr"
## [636] "tidyr" "tibble" "tidyverse" "Seurat" "scater"
## [641] "ggplot2" "SingleCellExperiment" "SummarizedExperiment" "DelayedArray" "matrixStats"
## [646] "Biobase" "GenomicRanges" "GenomeInfoDb" "IRanges" "S4Vectors"
## [651] "BiocGenerics" "parallel" "stats4" "stats" "graphics"
## [656] "grDevices" "utils" "datasets" "methods" "base"
## [661] "RANN" "GSVA" "gridExtra" "clusterProfiler" "gplots"
## [666] "mosaic" "mosaicData" "ggformula" "ggstance" "knitr"
## [671] "mixOmics" "lattice" "MASS" "RColorBrewer" "DESeq2"
## [676] "png" "apeglm" "pheatmap" "reshape2" "magrittr"
## [681] "edgeR" "limma" "Matrix.utils" "Matrix" "cowplot"
## [686] "forcats" "stringr" "dplyr" "purrr" "readr"
## [691] "tidyr" "tibble" "tidyverse" "Seurat" "scater"
## [696] "ggplot2" "SingleCellExperiment" "SummarizedExperiment" "DelayedArray" "matrixStats"
## [701] "Biobase" "GenomicRanges" "GenomeInfoDb" "IRanges" "S4Vectors"
## [706] "BiocGenerics" "parallel" "stats4" "stats" "graphics"
## [711] "grDevices" "utils" "datasets" "methods" "base"
## [716] "RANN" "GSVA" "gridExtra" "clusterProfiler" "gplots"
## [721] "mosaic" "mosaicData" "ggformula" "ggstance" "knitr"
## [726] "mixOmics" "lattice" "MASS" "RColorBrewer" "DESeq2"
## [731] "png" "apeglm" "pheatmap" "reshape2" "magrittr"
## [736] "edgeR" "limma" "Matrix.utils" "Matrix" "cowplot"
## [741] "forcats" "stringr" "dplyr" "purrr" "readr"
## [746] "tidyr" "tibble" "tidyverse" "Seurat" "scater"
## [751] "ggplot2" "SingleCellExperiment" "SummarizedExperiment" "DelayedArray" "matrixStats"
## [756] "Biobase" "GenomicRanges" "GenomeInfoDb" "IRanges" "S4Vectors"
## [761] "BiocGenerics" "parallel" "stats4" "stats" "graphics"
## [766] "grDevices" "utils" "datasets" "methods" "base"
## [771] "RANN" "GSVA" "gridExtra" "clusterProfiler" "gplots"
## [776] "mosaic" "mosaicData" "ggformula" "ggstance" "knitr"
## [781] "mixOmics" "lattice" "MASS" "RColorBrewer" "DESeq2"
## [786] "png" "apeglm" "pheatmap" "reshape2" "magrittr"
## [791] "edgeR" "limma" "Matrix.utils" "Matrix" "cowplot"
## [796] "forcats" "stringr" "dplyr" "purrr" "readr"
## [801] "tidyr" "tibble" "tidyverse" "Seurat" "scater"
## [806] "ggplot2" "SingleCellExperiment" "SummarizedExperiment" "DelayedArray" "matrixStats"
## [811] "Biobase" "GenomicRanges" "GenomeInfoDb" "IRanges" "S4Vectors"
## [816] "BiocGenerics" "parallel" "stats4" "stats" "graphics"
## [821] "grDevices" "utils" "datasets" "methods" "base"
## [826] "RANN" "GSVA" "gridExtra" "clusterProfiler" "gplots"
## [831] "mosaic" "mosaicData" "ggformula" "ggstance" "knitr"
## [836] "mixOmics" "lattice" "MASS" "RColorBrewer" "DESeq2"
## [841] "png" "apeglm" "pheatmap" "reshape2" "magrittr"
## [846] "edgeR" "limma" "Matrix.utils" "Matrix" "cowplot"
## [851] "forcats" "stringr" "dplyr" "purrr" "readr"
## [856] "tidyr" "tibble" "tidyverse" "Seurat" "scater"
## [861] "ggplot2" "SingleCellExperiment" "SummarizedExperiment" "DelayedArray" "matrixStats"
## [866] "Biobase" "GenomicRanges" "GenomeInfoDb" "IRanges" "S4Vectors"
## [871] "BiocGenerics" "parallel" "stats4" "stats" "graphics"
## [876] "grDevices" "utils" "datasets" "methods" "base"
## [881] "RANN" "GSVA" "gridExtra" "clusterProfiler" "gplots"
## [886] "mosaic" "mosaicData" "ggformula" "ggstance" "knitr"
## [891] "mixOmics" "lattice" "MASS" "RColorBrewer" "DESeq2"
## [896] "png" "apeglm" "pheatmap" "reshape2" "magrittr"
## [901] "edgeR" "limma" "Matrix.utils" "Matrix" "cowplot"
## [906] "forcats" "stringr" "dplyr" "purrr" "readr"
## [911] "tidyr" "tibble" "tidyverse" "Seurat" "scater"
## [916] "ggplot2" "SingleCellExperiment" "SummarizedExperiment" "DelayedArray" "matrixStats"
## [921] "Biobase" "GenomicRanges" "GenomeInfoDb" "IRanges" "S4Vectors"
## [926] "BiocGenerics" "parallel" "stats4" "stats" "graphics"
## [931] "grDevices" "utils" "datasets" "methods" "base"
## [936] "RANN" "GSVA" "gridExtra" "clusterProfiler" "gplots"
## [941] "mosaic" "mosaicData" "ggformula" "ggstance" "knitr"
## [946] "mixOmics" "lattice" "MASS" "RColorBrewer" "DESeq2"
## [951] "png" "apeglm" "pheatmap" "reshape2" "magrittr"
## [956] "edgeR" "limma" "Matrix.utils" "Matrix" "cowplot"
## [961] "forcats" "stringr" "dplyr" "purrr" "readr"
## [966] "tidyr" "tibble" "tidyverse" "Seurat" "scater"
## [971] "ggplot2" "SingleCellExperiment" "SummarizedExperiment" "DelayedArray" "matrixStats"
## [976] "Biobase" "GenomicRanges" "GenomeInfoDb" "IRanges" "S4Vectors"
## [981] "BiocGenerics" "parallel" "stats4" "stats" "graphics"
## [986] "grDevices" "utils" "datasets" "methods" "base"
## [991] "RANN" "GSVA" "gridExtra" "clusterProfiler" "gplots"
## [996] "mosaic" "mosaicData" "ggformula" "ggstance" "knitr"
## [ reached getOption("max.print") -- omitted 485 entries ]
# Bring in Seurat object
seurat <- readRDS('Results/seurat_labelled.rds')
# Extract raw counts and metadata to create SingleCellExperiment object
counts <- seurat@assays$RNA@counts
metadata <- seurat@meta.data
# Set up metadata as desired for aggregation and DE analysis
metadata$cluster_id <- factor(seurat@active.ident)
# Create single cell experiment object
sce <- SingleCellExperiment(assays = list(counts = counts),
colData = metadata)
pdf('Figure/cell_cluster.pdf',
width = 10,
height = 7)
DimPlot(seurat,
reduction = "umap",
label = TRUE,
label.size = 4,
repel = TRUE)
dev.off()
## RStudioGD
## 2
pdf('Figure/cell_cluster_by_condition.pdf',
width = 12,
height = 7)
DimPlot(seurat,
reduction = "umap",
label = TRUE,
label.size = 4,
repel = TRUE,
split.by = "condition") + NoLegend()
dev.off()
## RStudioGD
## 2
# Checj the number of cells from each sample in each cluster
options(width = 100)
table(sce$cluster_id, sce$sample)
##
## Homeostasis-M1 Homeostasis-M2 Homeostasis-M3 HPDay4-M1 HPDay4-M2 HPDay4-M3
## CD4 105 153 127 19 56 13
## CD8 106 64 407 1572 750 1522
## Plasma cell 11 15 1 139 223 159
## DC 59 103 69 151 169 185
## Inflammatory Monocytes 8 5 19 101 115 75
## Macrophage (M2) 105 130 76 23 25 35
## NK 17 9 27 18 20 21
## Neutrophil 6 2 7 41 46 84
## Epithelial 6 5 29 6 8 11
## Monocyte (cycling) 35 46 18 27 84 37
## B cell 998 1270 2485 0 1 2
## pDC 8 11 16 0 0 0
cell_distribution <- as.data.frame.matrix(table(sce$cluster_id, sce$sample))
total_num <- colSums(cell_distribution)
df <- melt(as.matrix(cell_distribution), value.name = "cell_num")
df$Var1 <- factor(df$Var1)
colnames(df)[1:2] <- c("Cell_Type", "Sample")
ggplot(df, aes(x = Sample, y = cell_num, fill = Cell_Type)) +
geom_bar(position = "fill", stat="identity") + ggtitle("Cell Number Distribution")
pdf('Figure/cell_distribution.pdf',
width = 10,
height = 7)
ggplot(df, aes(x = Sample, y = cell_num, fill = Cell_Type)) +
geom_bar(position = "fill", stat="identity") + ggtitle("Cell Number Distribution")
dev.off()
## RStudioGD
## 2
I actually cannot run DESeq on this dataset because the gene expression counts are already normalized so therefore not integers. I cannot load them into DESeq2.
features <- c("Raf1", "Grk2", "Akt1", "Cdk1", "Csnk2b", "Mknk1", "Myd88", "Nfkbib", "Arhgdia", "Rac1", "Ripk1", "Stat2", "Tnfrsf1a", "Vav2", "Ywhaz", "Pdk1", "Trib3", "Dapp1", "Map2k3", "Mapk8", "Tbk1", "Ikbke", "Gsk3a", "Actr3", "Ap2m1", "Ptpn6")
DotPlot(seurat, features = features, cols = "RdBu") + RotatedAxis() + coord_flip()
## Warning in FetchData(object = object, vars = features): The following requested variables were not
## found: Grk2
pdf('Figure/mTOR_leadingedge_Dotplot.pdf',
width = 6,
height = 8)
DotPlot(seurat, features = features, cols = "RdBu") + RotatedAxis() + coord_flip()
## Warning in FetchData(object = object, vars = features): The following requested variables were not
## found: Grk2
dev.off()
## RStudioGD
## 2
VlnPlot(object = seurat,
features = features,
pt.size = 0.1,
ncol = 4)
## Warning in FetchData(object = object, vars = features, slot = slot): The following requested
## variables were not found: Grk2
The GSAV analysis requires input of a expression matrix object and a list of gene sets.
The genesets here are Broad MsigDB genesets converted to mouse Entrez ID. These genesets are downloaded from here.
GSVA is only done on the Hallmark geneset.
DefaultAssay(seurat) <- "RNA"
# Prepare the normalized count matrix
# This count matrix is log transformed so we will need to use "Gaussian" distribution for GSVA
counts <- as.matrix(seurat@assays$RNA@data)
# convert gene identifiers to Entrez ID
suppressMessages(suppressWarnings(id_list <- read_csv("Data/mouse_symbol_Ensembl.csv")))
counts <- rownames_to_column(as.data.frame(counts), var = "gene_name")
counts <- inner_join(counts, id_list[,c(2:3)], by = c("gene_name" = "Marker Symbol"))
counts$gene_name <- NULL
non_duplicate_id <- which(duplicated(counts$"EntrezGene ID") == FALSE)
counts <- counts[non_duplicate_id,]
rownames(counts) <- counts$"EntrezGene ID"
counts$"EntrezGene ID" <- NULL
counts <- as.matrix(counts)
saveRDS(counts, "Data/counts.rds")
# Prepare the genesets
#load("Data/GeneSet/mouse_H_v5p2.rdata")
#load("Data/GeneSet/mouse_c2_v5p2.rdata")
#load("Data/GeneSet/mouse_c6_v5p2.rdata")
# filter the C2 genesets to only canonical pathways from BIOCARTA, KEGG, PID, and REACTOME
#Mm.c2.canonical <- Mm.c2[c(grep("^KEGG", names(Mm.c2)),
#+ grep("^REACTOME", names(Mm.c2)),
#+ grep("^BIOCARTA", names(Mm.c2)),
#+ grep("^PID", names(Mm.c2)))]
#save(Mm.c2.canonical, file = "Data/GeneSet/mouse_c2_canonical.rdata")
#load("Data/GeneSet/mouse_c2_canonical.rdata")
# concocanate a master list genesets that I want to test for
#Mm.gsva.master <- c(Mm.H, Mm.c2.canonical, Mm.c6)
#save(Mm.gsva.master, file = "Data/GeneSet/mouse_gsva_master.rdata")
load("Data/GeneSet/mouse_H_v5p2.rdata")
Now let's run GSVA. A Gaussian kernal was used for this since the input counts were natural log normalized from Seurat.
The ECDF estimatation based on the Gaussuan kernal takes forever to run on a local machine. So the actual GSVA is run on the O2 cluster.
Here is the R code used on O2.
# This part is not run in the markdown
library(GSVA)
load("mouse_H_v5p2.rdata")
counts <- readRDS("counts.rds")
gsva_out <- gsva(counts, Mm.H, min.sz=5, max.sz=500,
kcdf="Gaussian", mx.diff=TRUE, verbose=TRUE, parallel.sz=5)
saveRDS(gsva_out, "gsva_out.rds")
Here is the script for submitting the O2 job. I used 5 cores for parallel computing.
#!/bin/bash
#SBATCH -p priority
#SBATCH -t 0-12:00
#SBATCH --mem 100G
#SBATCH -c 5
#SBATCH -e out/ca.%j.err
#SBATCH -o out/ca.%j.out
module load gcc/6.2.0 R/3.6.1
Rscript GSVA.R
Here we have the output from O2 cluster and load it here and merge it into metadata for later analysis.
gsva_res <- t(readRDS("Results/gsva_out.rds"))
seurat@meta.data <- cbind(seurat@meta.data, gsva_res)
write_rds(seurat,
path = "Results/seurat_gsva.rds")
Just some exploratory analysis using some Hallmark gene sets.
FeaturePlot(object = seurat,
features = c("HALLMARK_MTORC1_SIGNALING", "HALLMARK_PI3K_AKT_MTOR_SIGNALING", "HALLMARK_MYC_TARGETS_V1", "HALLMARK_MYC_TARGETS_V2", "HALLMARK_KRAS_SIGNALING_DN"),
order = TRUE,
min.cutoff = 'q10',
label = FALSE,
repel = TRUE,
ncol = 3)
pdf('Figure/GSVA_TCT-specific.pdf',
width = 20,
height = 10)
FeaturePlot(object = seurat,
features = c("HALLMARK_MTORC1_SIGNALING", "HALLMARK_PI3K_AKT_MTOR_SIGNALING", "HALLMARK_MYC_TARGETS_V1", "HALLMARK_MYC_TARGETS_V2", "HALLMARK_KRAS_SIGNALING_DN"),
order = TRUE,
min.cutoff = 'q10',
label = FALSE,
repel = TRUE,
ncol = 3)
dev.off()
## RStudioGD
## 2
# creat a new annotation column for inflammatory monocytes and all other cells
seurat@meta.data$inf.monocyte <- "other cells"
seurat@meta.data$inf.monocyte[grep("Inflammatory Monocytes", seurat@meta.data$orig.annotation)] <- "Inflammatory Monocytes"
seurat@meta.data$inf.monocyte[grep("Neutrophil", seurat@meta.data$orig.annotation)] <- "Neutrophil"
Idents(object = seurat) <- "inf.monocyte"
# violin plot
VlnPlot(object = seurat,
features = c("HALLMARK_MTORC1_SIGNALING"),
pt.size = 0) + geom_boxplot(width = 0.1)
pdf('Figure/GSVA_mTORC1_violin.pdf',
width = 10,
height = 7)
VlnPlot(object = seurat,
features = c("HALLMARK_MTORC1_SIGNALING"),
pt.size = 0) + geom_boxplot(width = 0.1)
dev.off()
## RStudioGD
## 2
# Stats
NES <- seurat@meta.data$HALLMARK_MTORC1_SIGNALING
NES_inf_mono <- NES[grep("Inflammatory Monocytes", seurat@meta.data$inf.monocyte)]
NES_neutrophil <- NES[grep("Neutrophil", seurat@meta.data$inf.monocyte)]
NES_other <- NES[grep("other cells", seurat@meta.data$inf.monocyte)]
## check if NES is normally distributed
qqnorm(NES); qqline(NES)
## data is relatively normal, but I am using a non-parametric method here just to be safe
stats::wilcox.test(NES_inf_mono, NES_other, alternative = c("greater"), paired = FALSE)
##
## Wilcoxon rank sum test with continuity correction
##
## data: NES_inf_mono and NES_other
## W = 3159338, p-value < 2.2e-16
## alternative hypothesis: true location shift is greater than 0
stats::wilcox.test(NES_neutrophil, NES_other, alternative = c("greater"), paired = FALSE)
##
## Wilcoxon rank sum test with continuity correction
##
## data: NES_neutrophil and NES_other
## W = 915910, p-value = 0.9999
## alternative hypothesis: true location shift is greater than 0
# violin plot
VlnPlot(object = seurat,
features = c("HALLMARK_PI3K_AKT_MTOR_SIGNALING"),
pt.size = 0) + geom_boxplot(width = 0.1)
pdf('Figure/GSVA_PI3K_mTOR_violin.pdf',
width = 10,
height = 7)
VlnPlot(object = seurat,
features = c("HALLMARK_PI3K_AKT_MTOR_SIGNALING"),
pt.size = 0) + geom_boxplot(width = 0.1)
dev.off()
## RStudioGD
## 2
# Stats
NES <- seurat@meta.data$HALLMARK_PI3K_AKT_MTOR_SIGNALING
NES_inf_mono <- NES[grep("Inflammatory Monocytes", seurat@meta.data$inf.monocyte)]
NES_neutrophil <- NES[grep("Neutrophil", seurat@meta.data$inf.monocyte)]
NES_other <- NES[grep("other cells", seurat@meta.data$inf.monocyte)]
## check if NES is normally distributed
qqnorm(NES); qqline(NES)
## data is relatively normal, but I am using a non-parametric method here just to be safe
stats::wilcox.test(NES_inf_mono, NES_other, alternative = c("greater"), paired = FALSE)
##
## Wilcoxon rank sum test with continuity correction
##
## data: NES_inf_mono and NES_other
## W = 2910337, p-value < 2.2e-16
## alternative hypothesis: true location shift is greater than 0
stats::wilcox.test(NES_neutrophil, NES_other, alternative = c("greater"), paired = FALSE)
##
## Wilcoxon rank sum test with continuity correction
##
## data: NES_neutrophil and NES_other
## W = 1494134, p-value < 2.2e-16
## alternative hypothesis: true location shift is greater than 0
FeaturePlot(object = seurat,
features = c("HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION", "HALLMARK_APICAL_JUNCTION", "HALLMARK_PROTEIN_SECRETION", "HALLMARK_ANDROGEN_RESPONSE", "HALLMARK_P53_PATHWAY"),
order = TRUE,
min.cutoff = 'q10',
label = FALSE,
repel = TRUE,
ncol = 3)
I will be using the method described in the Biton et al., 2018 paper.
"To score a specific set of n genes in a given cell, a ‘background’ gene set was defined to control for differences in sequencing coverage and library complexity between cells (Kowalczyk et al., 2015). The background gene set was selected to be similar to the genes of interest in terms of expression level. Specifically, the 10n nearest gene neighbors in the 2-D space defined by mean expression and detection frequency across all cells were selected. The signature score for that cell was then defined as the mean expression of the n signature genes in that cell, minus the mean expression of the 10n background genes in that cell."
First I need to acquire gene level metadata to generate the 2-D space for each gene. The 2 dimensions needed here are mean expression and detection frequency.
counts <- seurat@assays$RNA@counts
counts_df <- as.data.frame(counts)
gene_metadata <- as.data.frame(cbind(rownames(counts_df), rowMeans(counts_df), rowSums(!counts_df == 0)/dim(counts_df)[2]))
gene_metadata$V1 <- NULL
colnames(gene_metadata) <- c("mean_expression", "detection_freq")
# index the gene list of interest to get their coord on the 2-D space
gene_index <- match(features, rownames(gene_metadata))
# remove NA
gene_index <- gene_index[!is.na(gene_index)]
feature_df <- gene_metadata[gene_index,]
other_df <- gene_metadata[-gene_index,]
# find 10n nearest neighbor of these features in the background list
bg.gene <- nn2(other_df, feature_df, k = 10)
bg.gene.index <- unique(as.vector(bg.gene$nn.idx))
bg.gene.df <- other_df[bg.gene.index,]
# now we have the gene signature of interest and the background gene list
# signature score for each cell is the mean expression of the signature genes minus the mean expression of the background genes
bg.index <- match(rownames(bg.gene.df), rownames(counts_df))
sig.df <- counts_df[gene_index, ]
bg.df <- counts_df[bg.index,]
sig.score <- colMeans(sig.df) - colMeans(bg.df)
seurat@meta.data$"mTOR_leading_edge_sig_score" <- sig.score
write_rds(seurat,
path = "Results/seurat_gsva.rds")
FeaturePlot(object = seurat,
features = c("mTOR_leading_edge_sig_score"),
order = TRUE,
min.cutoff = 'q10',
label = FALSE,
repel = TRUE)
pdf('Figure/mTOR_leading_edge_signature.pdf',
width = 7,
height = 5)
FeaturePlot(object = seurat,
features = c("mTOR_leading_edge_sig_score"),
order = TRUE,
min.cutoff = 'q10',
label = FALSE,
repel = TRUE)
dev.off()
## RStudioGD
## 2
pdf('Figure/mTOR_leading_edge_signature_Voilin.pdf',
width = 10,
height = 7)
# violin plot
VlnPlot(object = seurat,
features = c("mTOR_leading_edge_sig_score"),
pt.size = 0) + geom_boxplot(width = 0.1)
dev.off()
## RStudioGD
## 2
# Stats
sig.score_inf_mono <- sig.score[grep("Inflammatory Monocytes", seurat@meta.data$inf.monocyte)]
sig.score_neutrophil <- sig.score[grep("Neutrophil", seurat@meta.data$inf.monocyte)]
sig.score_other <- sig.score[grep("other cells", seurat@meta.data$inf.monocyte)]
## check if sig.score is normally distributed
qqnorm(sig.score); qqline(sig.score)
## data is relatively normal, but I am using a non-parametric method here just to be safe
stats::wilcox.test(sig.score_inf_mono, sig.score_other, alternative = c("greater"), paired = FALSE)
##
## Wilcoxon rank sum test with continuity correction
##
## data: sig.score_inf_mono and sig.score_other
## W = 3289766, p-value < 2.2e-16
## alternative hypothesis: true location shift is greater than 0
stats::wilcox.test(sig.score_neutrophil, sig.score_other, alternative = c("greater"), paired = FALSE)
##
## Wilcoxon rank sum test with continuity correction
##
## data: sig.score_neutrophil and sig.score_other
## W = 1959241, p-value < 2.2e-16
## alternative hypothesis: true location shift is greater than 0